# Toggle min-max scaling of the continuous features
scale_data = True
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from math import ceil
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline
df = pd.read_excel('Covid May 2020 data.xlsx', sheet_name='Data')
df.tail()
|  | age | bmi | HbA1c_mmol_mol | has_asthma | has_high_blood_pressure | result |
|---|---|---|---|---|---|---|
| 10444 | 94 | 33.8 | 67 | 0 | 1 | died |
| 10445 | 90 | 23.3 | 100 | 0 | 1 | hospitalised |
| 10446 | 83 | 28.6 | 33 | 1 | 0 | died |
| 10447 | 31 | 18.2 | 80 | 0 | 0 | mild illness |
| 10448 | 26 | 26.9 | 40 | 1 | 0 | mild illness |
# Scale data
# Fit one MinMaxScaler per continuous column and return the fitted
# scalers, so the same training statistics can be reused on new data
# later (refitting on new data would scale it inconsistently).
def scale_dataframe(df, scalers=None):
    cols = ['age', 'bmi', 'HbA1c_mmol_mol']
    if scalers is None:
        scalers = {col: MinMaxScaler().fit(df[[col]]) for col in cols}
    for col in cols:
        df[col] = scalers[col].transform(df[[col]])
    return df, scalers
if scale_data:
    df, scalers = scale_dataframe(df)
df.head()
|  | age | bmi | HbA1c_mmol_mol | has_asthma | has_high_blood_pressure | result |
|---|---|---|---|---|---|---|
| 0 | 0.243902 | 0.263158 | 0.472222 | 1 | 0 | mild illness |
| 1 | 0.829268 | 0.257895 | 0.347222 | 0 | 0 | hospitalised |
| 2 | 0.451220 | 0.600000 | 0.736111 | 0 | 1 | hospitalised |
| 3 | 0.719512 | 0.110526 | 0.444444 | 0 | 1 | died |
| 4 | 0.060976 | 0.963158 | 0.625000 | 0 | 0 | mild illness |
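# Min-max scaling maps x to (x - min) / (max - min). The fitted scalers
# store those statistics, which can be inspected (a quick check, assuming
# scale_data is True so the scalers dict above exists):
for col, scaler in scalers.items():
    print(f'{col}: min={scaler.data_min_[0]}, max={scaler.data_max_[0]}')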
# Replace targets with ints
result_map = {'mild illness': 0, 'hospitalised': 1, 'died': 2}
result_map_reverse = {value: key for key, value in result_map.items()}
df['result'] = df['result'].replace(result_map)
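# The three outcome classes are unlikely to be perfectly balanced; a quick
# check of the class distribution before training:
print(df['result'].value_counts())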
# Set X, y and convert them to np arrays
X = df.drop('result', axis=1)
feature_names = X.columns.tolist()  # keep the feature names for reporting later
num_cols = X.shape[1]
y = df['result']
num_outputs = y.nunique()
X = X.values
y = y.values
# Hidden-layer size heuristic: the mean of input and output widths,
# here ceil((5 + 3) / 2) = 4 units
h1_layers = ceil((num_cols + num_outputs)/2)
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41)
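# With three uneven outcome classes, a stratified split keeps the class
# ratios the same in train and test. An optional variant (not used for
# the results below):
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=41, stratify=y)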
# Convert X features to float tensors
X_train = torch.FloatTensor(X_train)
X_test = torch.FloatTensor(X_test)
# Convert y labels to long tensors
y_train = torch.LongTensor(y_train)
y_test = torch.LongTensor(y_test)
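# Quick sanity check of shapes and dtypes before training:
print(X_train.shape, X_train.dtype)  # (n_train, 5), torch.float32
print(y_train.shape, y_train.dtype)  # (n_train,), torch.int64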
class Model(nn.Module):
    def __init__(self, in_features=num_cols, h1=h1_layers, out_features=num_outputs):
        super().__init__()
        self.fc1 = nn.Linear(in_features, h1)
        self.out = nn.Linear(h1, out_features)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.out(x)  # raw logits; CrossEntropyLoss applies softmax internally
        return x
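# nn.Module provides a readable repr for free; printing an instance confirms
# the layer sizes the heuristic above produced:
print(Model())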
torch.manual_seed(49)
model = Model()
losses = []
epochs = 250
loss_function = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(model.parameters(), lr=0.01)
for i in range(epochs):
    y_pred = model(X_train)  # forward pass over the whole training set
    loss = loss_function(y_pred, y_train)  # compare logits with the y_train labels
    losses.append(loss.item())
    if i % 10 == 0:
        print(f'At epoch {i} loss was: {loss}')
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
At epoch 0 loss was: 1.1762192249298096
At epoch 10 loss was: 1.0961493253707886
At epoch 20 loss was: 1.0698022842407227
At epoch 30 loss was: 1.0202093124389648
At epoch 40 loss was: 0.9572418928146362
At epoch 50 loss was: 0.8754114508628845
At epoch 60 loss was: 0.7935022711753845
At epoch 70 loss was: 0.720582902431488
At epoch 80 loss was: 0.6599651575088501
At epoch 90 loss was: 0.6124850511550903
At epoch 100 loss was: 0.5760200023651123
At epoch 110 loss was: 0.5480552315711975
At epoch 120 loss was: 0.5266116857528687
At epoch 130 loss was: 0.510050356388092
At epoch 140 loss was: 0.49717631936073303
At epoch 150 loss was: 0.48710158467292786
At epoch 160 loss was: 0.47918781638145447
At epoch 170 loss was: 0.47297823429107666
At epoch 180 loss was: 0.4681414067745209
At epoch 190 loss was: 0.46439245343208313
At epoch 200 loss was: 0.46149691939353943
At epoch 210 loss was: 0.45926496386528015
At epoch 220 loss was: 0.45754602551460266
At epoch 230 loss was: 0.4562242329120636
At epoch 240 loss was: 0.4552067220211029
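# The loop above takes one full-batch gradient step per epoch, which is fine
# at this data size. For larger datasets, a mini-batch epoch with DataLoader
# is the usual alternative (a sketch only; not used for the results here):
from torch.utils.data import DataLoader, TensorDataset

def train_one_epoch(model, loss_function, optimiser, X, y, batch_size=64):
    loader = DataLoader(TensorDataset(X, y), batch_size=batch_size, shuffle=True)
    for xb, yb in loader:
        loss = loss_function(model(xb), yb)
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()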
plt.plot(range(epochs), losses)
plt.ylabel("Loss")
plt.xlabel('Epoch')
plt.show()
# Determine feature importance by permuting one column at a time and
# measuring how much test accuracy drops relative to the baseline
def permutation_feature_importance(model, X_test, y_test, n_iterations=10):
    baseline_accuracy = evaluate_accuracy(model, X_test, y_test)
    print(f'Accuracy (%): {baseline_accuracy*100:.2f}')
    feature_importances = np.zeros(X_test.shape[1])
    for i in range(X_test.shape[1]):
        accuracy_scores = []
        for _ in range(n_iterations):
            X_test_permuted = X_test.detach().clone()
            X_test_permuted[:, i] = X_test_permuted[:, i][torch.randperm(X_test.shape[0])]
            accuracy = evaluate_accuracy(model, X_test_permuted, y_test)
            accuracy_scores.append(accuracy)
        feature_importances[i] = baseline_accuracy - np.mean(accuracy_scores)
    return feature_importances
# Evaluate model (on test dataset)
def evaluate_accuracy(model, X_test, y_test):
    with torch.no_grad():  # gradients are needed for training but not for evaluation
        y_pred = model(X_test)
        predicted = y_pred.argmax(dim=1)  # convert logits to predicted class labels
        accuracy = (predicted == y_test).sum().item() / len(y_test)
    return accuracy
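# Overall accuracy can hide weak classes; a minimal confusion-matrix sketch
# in plain torch (rows = true class, columns = predicted class):
def confusion_matrix(model, X_test, y_test, n_classes=num_outputs):
    with torch.no_grad():
        predicted = model(X_test).argmax(dim=1)
    matrix = torch.zeros(n_classes, n_classes, dtype=torch.long)
    for t, p in zip(y_test, predicted):
        matrix[t, p] += 1
    return matrix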
feature_importances = permutation_feature_importance(model, X_test, y_test)
importance_dict = {}
for column, importance in zip(feature_names, feature_importances):
    print(f"'{column}' importance score: {importance}")
    importance_dict[column] = importance
Accuracy (%): 79.47
'age' importance score: 0.4310526315789474
'bmi' importance score: 0.13339712918660296
'HbA1c_mmol_mol' importance score: 0.002105263157894721
'has_asthma' importance score: 0.0011004784688994906
'has_high_blood_pressure' importance score: 0.0014354066985645675
# Create the bar chart
fig, ax = plt.subplots()
ax.bar(list(importance_dict.keys()), list(importance_dict.values()))
ax.set_ylabel("Importance Score")
plt.xticks(rotation=90)
plt.show()
df_new = pd.read_excel('Data to predict on.xlsx', sheet_name='Data')
df_new.tail()
|  | age | bmi | HbA1c_mmol_mol | has_asthma | has_high_blood_pressure |
|---|---|---|---|---|---|
| 1125 | 23 | 34.5 | 36 | 0 | 1 |
| 1126 | 46 | 28.8 | 94 | 0 | 0 |
| 1127 | 72 | 28.7 | 67 | 1 | 1 |
| 1128 | 91 | 19.8 | 36 | 0 | 1 |
| 1129 | 69 | 16.7 | 81 | 0 | 0 |
if scale_data:
    # Reuse the scalers fitted on the original data, so the new records are
    # scaled with the same min/max statistics rather than refitted on themselves
    df_new_for_model, _ = scale_dataframe(df_new.copy(), scalers)
else:
    df_new_for_model = df_new
predictions = []
for index, row in df_new_for_model.iterrows():
    new_person = torch.tensor([row['age'],
                               row['bmi'],
                               row['HbA1c_mmol_mol'],
                               row['has_asthma'],
                               row['has_high_blood_pressure']],
                              dtype=torch.float32)
    with torch.no_grad():
        pred = model(new_person)
    max_index = pred.argmax().item()
    result = result_map_reverse[max_index]
    predictions.append(result)
df_new['prediction'] = predictions
df_new.head()
|  | age | bmi | HbA1c_mmol_mol | has_asthma | has_high_blood_pressure | prediction |
|---|---|---|---|---|---|---|
| 0 | 30 | 32.6 | 51 | 1 | 0 | mild illness |
| 1 | 40 | 20.5 | 89 | 0 | 0 | mild illness |
| 2 | 94 | 21.0 | 34 | 1 | 0 | died |
| 3 | 79 | 21.7 | 36 | 0 | 1 | hospitalised |
| 4 | 33 | 32.5 | 40 | 0 | 1 | mild illness |
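# The row-by-row loop above is easy to read but slow for large files; the
# same predictions can be computed in one batched forward pass (a sketch,
# writing to a hypothetical 'prediction_batched' column for comparison):
features = torch.tensor(df_new_for_model[feature_names].values, dtype=torch.float32)
with torch.no_grad():
    batch_preds = model(features).argmax(dim=1)
df_new['prediction_batched'] = [result_map_reverse[i] for i in batch_preds.tolist()]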
torch.save(model.state_dict(), 'Covid Multi-Class.pt')
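# To reuse the saved weights later, rebuild the architecture and load the
# state dict (a minimal sketch):
loaded_model = Model()
loaded_model.load_state_dict(torch.load('Covid Multi-Class.pt'))
loaded_model.eval()  # switch to inference mode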